Project

In [1]:
! pip install plotly_express
! pip install geopandas
Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (0.10.2)
Requirement already satisfied: plotly>=4.1.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (5.5.0)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.21.5)
Requirement already satisfied: pandas>=0.20.0 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.3.5)
Requirement already satisfied: scipy>=0.18 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (1.4.1)
Requirement already satisfied: patsy>=0.5 in /usr/local/lib/python3.7/dist-packages (from plotly_express) (0.5.2)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.0->plotly_express) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.0->plotly_express) (2018.9)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from patsy>=0.5->plotly_express) (1.15.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly>=4.1.0->plotly_express) (8.0.1)
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1
Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
     |████████████████████████████████| 1.0 MB 9.5 MB/s 
Requirement already satisfied: shapely>=1.6 in /usr/local/lib/python3.7/dist-packages (from geopandas) (1.8.1.post1)
Collecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
     |████████████████████████████████| 16.7 MB 257 kB/s 
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
     |████████████████████████████████| 6.3 MB 57.8 MB/s 
Requirement already satisfied: pandas>=0.25.0 in /usr/local/lib/python3.7/dist-packages (from geopandas) (1.3.5)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from fiona>=1.8->geopandas) (2021.10.8)
Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from fiona>=1.8->geopandas) (57.4.0)
Requirement already satisfied: six>=1.7 in /usr/local/lib/python3.7/dist-packages (from fiona>=1.8->geopandas) (1.15.0)
Requirement already satisfied: attrs>=17 in /usr/local/lib/python3.7/dist-packages (from fiona>=1.8->geopandas) (21.4.0)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Requirement already satisfied: click>=4.0 in /usr/local/lib/python3.7/dist-packages (from fiona>=1.8->geopandas) (7.1.2)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.25.0->geopandas) (1.21.5)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.25.0->geopandas) (2018.9)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.25.0->geopandas) (2.8.2)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1
In [2]:
#Importing the necessary libraries
import matplotlib.pyplot as plt 
import matplotlib.image as mpimg
import numpy as np
import os 
import pandas as pd 
import seaborn as sns
import math
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pandas.plotting import scatter_matrix
from geopy.distance import vincenty
from geopy.geocoders import Nominatim
import folium
from folium import Marker
import geopandas as gpd
import plotly_express as px
from xgboost.sklearn import XGBRegressor
import gc
from sklearn.pipeline import Pipeline
from scipy.stats import zscore
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
In [3]:
sns.set(rc={'figure.figsize':(11,8)})
'''Suppress Scientific Notation '''
pd.options.display.float_format = '{:.2f}'.format
In [4]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [5]:
#reading the data as a data frame
path = "/content/drive/MyDrive/ML Capstone Project - 2022/Data/innercity.csv"
data_city=pd. read_csv(path)
print (data_city.head())
print (data_city.shape)
          cid         dayhours   price  room_bed  room_bath  living_measure  \
0  3034200666  20141107T000000  808100         4       3.25            3020   
1  8731981640  20141204T000000  277500         4       2.50            2550   
2  5104530220  20150420T000000  404000         3       2.50            2370   
3  6145600285  20140529T000000  300000         2       1.00             820   
4  8924100111  20150424T000000  699000         2       1.50            1400   

   lot_measure  ceil  coast  sight  ...  basement  yr_built  yr_renovated  \
0        13457  1.00      0      0  ...         0      1956             0   
1         7500  1.00      0      0  ...       800      1976             0   
2         4324  2.00      0      0  ...         0      2006             0   
3         3844  1.00      0      0  ...         0      1916             0   
4         4050  1.00      0      0  ...         0      1954             0   

   zipcode   lat    long  living_measure15  lot_measure15  furnished  \
0    98133 47.72 -122.34              2120           7553          1   
1    98023 47.32 -122.39              2260           8800          0   
2    98038 47.35 -122.00              2370           4348          0   
3    98133 47.70 -122.35              1520           3844          0   
4    98115 47.68 -122.27              1900           5940          0   

   total_area  
0       16477  
1       10050  
2        6694  
3        4664  
4        5450  

[5 rows x 23 columns]
(21613, 23)
In [6]:
print(data_city.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cid               21613 non-null  int64  
 1   dayhours          21613 non-null  object 
 2   price             21613 non-null  int64  
 3   room_bed          21613 non-null  int64  
 4   room_bath         21613 non-null  float64
 5   living_measure    21613 non-null  int64  
 6   lot_measure       21613 non-null  int64  
 7   ceil              21613 non-null  float64
 8   coast             21613 non-null  int64  
 9   sight             21613 non-null  int64  
 10  condition         21613 non-null  int64  
 11  quality           21613 non-null  int64  
 12  ceil_measure      21613 non-null  int64  
 13  basement          21613 non-null  int64  
 14  yr_built          21613 non-null  int64  
 15  yr_renovated      21613 non-null  int64  
 16  zipcode           21613 non-null  int64  
 17  lat               21613 non-null  float64
 18  long              21613 non-null  float64
 19  living_measure15  21613 non-null  int64  
 20  lot_measure15     21613 non-null  int64  
 21  furnished         21613 non-null  int64  
 22  total_area        21613 non-null  int64  
dtypes: float64(4), int64(18), object(1)
memory usage: 3.8+ MB
None
In [7]:
data_city.describe().transpose()
Out[7]:
count mean std min 25% 50% 75% max
cid 21613.00 4580301520.86 2876565571.31 1000102.00 2123049194.00 3904930410.00 7308900445.00 9900000190.00
price 21613.00 540182.16 367362.23 75000.00 321950.00 450000.00 645000.00 7700000.00
room_bed 21613.00 3.37 0.93 0.00 3.00 3.00 4.00 33.00
room_bath 21613.00 2.11 0.77 0.00 1.75 2.25 2.50 8.00
living_measure 21613.00 2079.90 918.44 290.00 1427.00 1910.00 2550.00 13540.00
lot_measure 21613.00 15106.97 41420.51 520.00 5040.00 7618.00 10688.00 1651359.00
ceil 21613.00 1.49 0.54 1.00 1.00 1.50 2.00 3.50
coast 21613.00 0.01 0.09 0.00 0.00 0.00 0.00 1.00
sight 21613.00 0.23 0.77 0.00 0.00 0.00 0.00 4.00
condition 21613.00 3.41 0.65 1.00 3.00 3.00 4.00 5.00
quality 21613.00 7.66 1.18 1.00 7.00 7.00 8.00 13.00
ceil_measure 21613.00 1788.39 828.09 290.00 1190.00 1560.00 2210.00 9410.00
basement 21613.00 291.51 442.58 0.00 0.00 0.00 560.00 4820.00
yr_built 21613.00 1971.01 29.37 1900.00 1951.00 1975.00 1997.00 2015.00
yr_renovated 21613.00 84.40 401.68 0.00 0.00 0.00 0.00 2015.00
zipcode 21613.00 98077.94 53.51 98001.00 98033.00 98065.00 98118.00 98199.00
lat 21613.00 47.56 0.14 47.16 47.47 47.57 47.68 47.78
long 21613.00 -122.21 0.14 -122.52 -122.33 -122.23 -122.12 -121.31
living_measure15 21613.00 1986.55 685.39 399.00 1490.00 1840.00 2360.00 6210.00
lot_measure15 21613.00 12768.46 27304.18 651.00 5100.00 7620.00 10083.00 871200.00
furnished 21613.00 0.20 0.40 0.00 0.00 0.00 0.00 1.00
total_area 21613.00 17186.87 41589.08 1423.00 7035.00 9575.00 13000.00 1652659.00
In [8]:
data_city.isnull().any()
Out[8]:
cid                 False
dayhours            False
price               False
room_bed            False
room_bath           False
living_measure      False
lot_measure         False
ceil                False
coast               False
sight               False
condition           False
quality             False
ceil_measure        False
basement            False
yr_built            False
yr_renovated        False
zipcode             False
lat                 False
long                False
living_measure15    False
lot_measure15       False
furnished           False
total_area          False
dtype: bool
In [9]:
''' Quick confirmation along with the above one'''
data_city.isna().sum()
Out[9]:
cid                 0
dayhours            0
price               0
room_bed            0
room_bath           0
living_measure      0
lot_measure         0
ceil                0
coast               0
sight               0
condition           0
quality             0
ceil_measure        0
basement            0
yr_built            0
yr_renovated        0
zipcode             0
lat                 0
long                0
living_measure15    0
lot_measure15       0
furnished           0
total_area          0
dtype: int64
In [10]:
data_city.duplicated().sum()
Out[10]:
0

There are no nulls or duplicates present in in the data frame. Now lets check on the correlation features

In [11]:
data_city.corr()
Out[11]:
cid price room_bed room_bath living_measure lot_measure ceil coast sight condition ... basement yr_built yr_renovated zipcode lat long living_measure15 lot_measure15 furnished total_area
cid 1.00 -0.02 0.00 0.01 -0.01 -0.13 0.02 -0.00 0.01 -0.02 ... -0.01 0.02 -0.02 -0.01 -0.00 0.02 -0.00 -0.14 -0.01 -0.13
price -0.02 1.00 0.31 0.53 0.70 0.09 0.26 0.27 0.40 0.04 ... 0.32 0.05 0.13 -0.05 0.31 0.02 0.59 0.08 0.57 0.10
room_bed 0.00 0.31 1.00 0.52 0.58 0.03 0.18 -0.01 0.08 0.03 ... 0.30 0.15 0.02 -0.15 -0.01 0.13 0.39 0.03 0.26 0.04
room_bath 0.01 0.53 0.52 1.00 0.75 0.09 0.50 0.06 0.19 -0.12 ... 0.28 0.51 0.05 -0.20 0.02 0.22 0.57 0.09 0.48 0.10
living_measure -0.01 0.70 0.58 0.75 1.00 0.17 0.35 0.10 0.28 -0.06 ... 0.44 0.32 0.06 -0.20 0.05 0.24 0.76 0.18 0.63 0.19
lot_measure -0.13 0.09 0.03 0.09 0.17 1.00 -0.01 0.02 0.07 -0.01 ... 0.02 0.05 0.01 -0.13 -0.09 0.23 0.14 0.72 0.12 1.00
ceil 0.02 0.26 0.18 0.50 0.35 -0.01 1.00 0.02 0.03 -0.26 ... -0.25 0.49 0.01 -0.06 0.05 0.13 0.28 -0.01 0.35 0.00
coast -0.00 0.27 -0.01 0.06 0.10 0.02 0.02 1.00 0.40 0.02 ... 0.08 -0.03 0.09 0.03 -0.01 -0.04 0.09 0.03 0.07 0.02
sight 0.01 0.40 0.08 0.19 0.28 0.07 0.03 0.40 1.00 0.05 ... 0.28 -0.05 0.10 0.08 0.01 -0.08 0.28 0.07 0.22 0.08
condition -0.02 0.04 0.03 -0.12 -0.06 -0.01 -0.26 0.02 0.05 1.00 ... 0.17 -0.36 -0.06 0.00 -0.01 -0.11 -0.09 -0.00 -0.12 -0.01
quality 0.01 0.67 0.36 0.66 0.76 0.11 0.46 0.08 0.25 -0.14 ... 0.17 0.45 0.01 -0.18 0.11 0.20 0.71 0.12 0.79 0.13
ceil_measure -0.01 0.61 0.48 0.69 0.88 0.18 0.52 0.07 0.17 -0.16 ... -0.05 0.42 0.02 -0.26 -0.00 0.34 0.73 0.19 0.65 0.20
basement -0.01 0.32 0.30 0.28 0.44 0.02 -0.25 0.08 0.28 0.17 ... 1.00 -0.13 0.07 0.07 0.11 -0.14 0.20 0.02 0.09 0.02
yr_built 0.02 0.05 0.15 0.51 0.32 0.05 0.49 -0.03 -0.05 -0.36 ... -0.13 1.00 -0.22 -0.35 -0.15 0.41 0.33 0.07 0.31 0.06
yr_renovated -0.02 0.13 0.02 0.05 0.06 0.01 0.01 0.09 0.10 -0.06 ... 0.07 -0.22 1.00 0.06 0.03 -0.07 -0.00 0.01 0.02 0.01
zipcode -0.01 -0.05 -0.15 -0.20 -0.20 -0.13 -0.06 0.03 0.08 0.00 ... 0.07 -0.35 0.06 1.00 0.27 -0.56 -0.28 -0.15 -0.14 -0.13
lat -0.00 0.31 -0.01 0.02 0.05 -0.09 0.05 -0.01 0.01 -0.01 ... 0.11 -0.15 0.03 0.27 1.00 -0.14 0.05 -0.09 0.08 -0.08
long 0.02 0.02 0.13 0.22 0.24 0.23 0.13 -0.04 -0.08 -0.11 ... -0.14 0.41 -0.07 -0.56 -0.14 1.00 0.33 0.25 0.19 0.23
living_measure15 -0.00 0.59 0.39 0.57 0.76 0.14 0.28 0.09 0.28 -0.09 ... 0.20 0.33 -0.00 -0.28 0.05 0.33 1.00 0.18 0.62 0.16
lot_measure15 -0.14 0.08 0.03 0.09 0.18 0.72 -0.01 0.03 0.07 -0.00 ... 0.02 0.07 0.01 -0.15 -0.09 0.25 0.18 1.00 0.13 0.72
furnished -0.01 0.57 0.26 0.48 0.63 0.12 0.35 0.07 0.22 -0.12 ... 0.09 0.31 0.02 -0.14 0.08 0.19 0.62 0.13 1.00 0.13
total_area -0.13 0.10 0.04 0.10 0.19 1.00 0.00 0.02 0.08 -0.01 ... 0.02 0.06 0.01 -0.13 -0.08 0.23 0.16 0.72 0.13 1.00

22 rows × 22 columns

In [ ]:
plt.figure(figsize= (18,15))
sns.heatmap(data_city.corr())
#plotting a heatmap for the correlation
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe94b8c42d0>

Univariate Analysis with Box Plot to see if there are any identifiers

In [ ]:
# let's boxplot all the numerical columns and see if there any outliers
for i in data_city.columns:
  try:
    data_city.iloc[:, 1:].boxplot(column=i)
    plt.show()
  except Exception as e :
    print (e)
"None of [Index(['cid'], dtype='object')] are in the [columns]"
"None of [Index(['dayhours'], dtype='object')] are in the [columns]"

Plot the distribution of all the columns to identify how the variables are placed

In [ ]:
#print(data_city.columns)   
columns = (data_city.select_dtypes(include=np.number).columns.tolist())
''' Put them as table'''

for col in data_city.columns:
    
    hist = data_city[col].hist(bins=10)
    print("Distribution Plot for  {0}".format(col))
    plt.show()
Distribution Plot for  cid
Distribution Plot for  dayhours
Distribution Plot for  price
Distribution Plot for  room_bed
Distribution Plot for  room_bath
Distribution Plot for  living_measure
Distribution Plot for  lot_measure
Distribution Plot for  ceil
Distribution Plot for  coast
Distribution Plot for  sight
Distribution Plot for  condition
Distribution Plot for  quality
Distribution Plot for  ceil_measure
Distribution Plot for  basement
Distribution Plot for  yr_built
Distribution Plot for  yr_renovated
Distribution Plot for  zipcode
Distribution Plot for  lat
Distribution Plot for  long
Distribution Plot for  living_measure15
Distribution Plot for  lot_measure15
Distribution Plot for  furnished
Distribution Plot for  total_area
In [ ]:

Pair plot: Matrix of scatterplots that lets you understand the pairwise relationship between different variables in a dataset.

Bivariate Analysis

In [ ]:
plt.figure(figsize= (75,50))
sns.pairplot(data_city)
''' Divide into multiple columns'''
Out[ ]:
' Divide into multiple columns'
<Figure size 5400x3600 with 0 Axes>
  • price: price distribution is Right-Skewed as we deduced earlier from our 5-factor analysis
  • room_bed: our target variable (price) and room_bed plot is not linear. It's distribution have lot of gaussians
  • room_bath: It's plot with price has somewhat linear relationship. Distribution has number of gaussians.
  • living_measure: Plot against price has strong linear relationship. It also have linear relationship with room_bath variable. So might remove one of these 2. Distribution is Right-Skewed.
  • lot_measure: No clear relationship with price.
  • ceil: No clear relationship with price. We can see, it's have 6 unique * values only. Therefore, we can convert this column into categorical column for values.
  • coast: No clear relationship with price. Clearly it's categorical variable with 2 unique values.
  • sight: No clear relationship with price. This has 5 unique values. Can be converted to Categorical variable.
  • condition: No clear relationship with price. This has 5 unique values. Can be converted to Categorical variable.
  • quality: Somewhat linear relationship with price. Has discrete values from 1 - 13. Can be converted to Categorical variable.
  • ceil_measure: Strong linear relationship with price. Also with room_bath and living_measure features. Distribution is Right-Skewed.
  • basement: No clear relationship with price.
  • yr_built: No clear relationship with price.
  • yr_renovated: No clear relationship with price. Have 2 unique values. Can be converted to Categorical Variable which tells whether house is renovated or not.
  • zipcode, lat, long: No clear relationship with price or any other feature. living_measure15: Somewhat linear relationship with target feature. It's same as living_measure. Therefore we can drop this variable.
  • lot_measure15: No clear relationship with price or any other feature.
  • furnished: No clear relationship with price or any other feature. 2 unique values so can be converted to Categorical Variable
  • total_area: No clear relationship with price. But it has Very Strong linear relationship with lot_measure. So one of it can be dropped.

Check for Null Values

In [ ]:
print(pd.isnull(data_city).any())
cid                 False
dayhours            False
price               False
room_bed            False
room_bath           False
living_measure      False
lot_measure         False
ceil                False
coast               False
sight               False
condition           False
quality             False
ceil_measure        False
basement            False
yr_built            False
yr_renovated        False
zipcode             False
lat                 False
long                False
living_measure15    False
lot_measure15       False
furnished           False
total_area          False
dtype: bool

Check for values which are correlated with the House prices

In [ ]:
plt.figure(figsize = (10,8))
sns.distplot(data_city['price'], kde=False, bins=8)
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe8a3ab7f10>

Most of the housing oprices are being quoted at 1 million and goes upto 3 million.

In [12]:
sns.lineplot(x='living_measure', y='price', data=data_city)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa11ed87510>

There is a clear correlation between the living area of the house and the price. If the area is too big, the price starts to decrease as there might be few buyers for the big house which are costly to maintain.

Following are few observations which can be seen how the prices are corrlelated to various features.

In [13]:
sns.lineplot(x='yr_built', y='price', data=data_city)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa11471c210>
In [14]:
sns.lineplot(x='quality', y='price', data=data_city)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa11470e4d0>
In [15]:
sns.lineplot(x='room_bed', y='price', data=data_city)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa1129d54d0>

THere can be seen that there is clear increasing trend with room bed , but some outliers are present though.

In [16]:
sns.lineplot(x='room_bath', y='price', data=data_city)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa1129966d0>

There is a clear upward trend in the price with increase in the room_bath.

In [17]:
sns.lineplot(x='condition', y='price', data=data_city)
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa1128cd610>
In [18]:
sns.lineplot(x="furnished", y="price", data=data_city);
In [19]:
plt.figure(figsize=(15,10))
print(sns.scatterplot(data_city['living_measure'],data_city['price'], hue = data_city['furnished']))
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
AxesSubplot(0.125,0.125;0.775x0.755)

It can be seen that there is a clear increase in the living measure with respect to the price and furnished with outliers.

In [20]:
#lot_measure >100000 - price increases with increase in living measure
plt.figure(figsize=(20, 15))
print(sns.scatterplot(data_city['lot_measure'],data_city['price']))
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
AxesSubplot(0.125,0.125;0.775x0.755)

There are not much of correlation between the lot measure and price.

In [21]:
print(sns.scatterplot(data_city['ceil'],data_city['price']))
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
AxesSubplot(0.125,0.125;0.775x0.755)
In [22]:
print(sns.lineplot(data_city['coast'],data_city['price']))
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
AxesSubplot(0.125,0.125;0.775x0.755)

Houses with water_front tend to have higher price compared to that of non-water_front properties.

In [23]:
renovated = data_city[data_city['yr_renovated'] > 0]
print(sns.scatterplot(renovated['yr_renovated'],data_city['price']))
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
AxesSubplot(0.125,0.125;0.775x0.755)

Many houses are renovated after 1975's tend to have higher price.

In [24]:
print(sns.scatterplot(data_city['lot_measure'],data_city['price'],hue=data_city['furnished']))
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
AxesSubplot(0.125,0.125;0.775x0.755)

Furnished houses have higher price than that of the Non-furnished houses.

In [25]:
sns.scatterplot(data_city['total_area'],data_city['price'],hue=data_city['furnished'])
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa112681a50>

Total area doesnt have any direct correlation with the price. Can be omitted while doing feature selection.

Create a new column for year Remove the unwanted columns and drop the columns CID and DayHours.

In [26]:
''' 
Making only the year of the sale to be in place.
'''
data_city["Year"] = data_city["dayhours"].apply(lambda x:x[0:4]) 
data_city['Year'] = pd.to_numeric(data_city['Year'])
print (data_city.head())
          cid         dayhours   price  room_bed  room_bath  living_measure  \
0  3034200666  20141107T000000  808100         4       3.25            3020   
1  8731981640  20141204T000000  277500         4       2.50            2550   
2  5104530220  20150420T000000  404000         3       2.50            2370   
3  6145600285  20140529T000000  300000         2       1.00             820   
4  8924100111  20150424T000000  699000         2       1.50            1400   

   lot_measure  ceil  coast  sight  ...  yr_built  yr_renovated  zipcode  \
0        13457  1.00      0      0  ...      1956             0    98133   
1         7500  1.00      0      0  ...      1976             0    98023   
2         4324  2.00      0      0  ...      2006             0    98038   
3         3844  1.00      0      0  ...      1916             0    98133   
4         4050  1.00      0      0  ...      1954             0    98115   

    lat    long  living_measure15  lot_measure15  furnished  total_area  Year  
0 47.72 -122.34              2120           7553          1       16477  2014  
1 47.32 -122.39              2260           8800          0       10050  2014  
2 47.35 -122.00              2370           4348          0        6694  2015  
3 47.70 -122.35              1520           3844          0        4664  2014  
4 47.68 -122.27              1900           5940          0        5450  2015  

[5 rows x 24 columns]
In [27]:
data_city.columns
Out[27]:
Index(['cid', 'dayhours', 'price', 'room_bed', 'room_bath', 'living_measure',
       'lot_measure', 'ceil', 'coast', 'sight', 'condition', 'quality',
       'ceil_measure', 'basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'living_measure15', 'lot_measure15', 'furnished',
       'total_area', 'Year'],
      dtype='object')
In [28]:
'''Remove the unwanted columns'''
data_city.drop( columns = ['cid','dayhours'],inplace = True)

Doesnt have any good correlation betwen the year

In [29]:
data_city.head(5)
Out[29]:
price room_bed room_bath living_measure lot_measure ceil coast sight condition quality ... yr_built yr_renovated zipcode lat long living_measure15 lot_measure15 furnished total_area Year
0 808100 4 3.25 3020 13457 1.00 0 0 5 9 ... 1956 0 98133 47.72 -122.34 2120 7553 1 16477 2014
1 277500 4 2.50 2550 7500 1.00 0 0 3 8 ... 1976 0 98023 47.32 -122.39 2260 8800 0 10050 2014
2 404000 3 2.50 2370 4324 2.00 0 0 3 8 ... 2006 0 98038 47.35 -122.00 2370 4348 0 6694 2015
3 300000 2 1.00 820 3844 1.00 0 0 4 6 ... 1916 0 98133 47.70 -122.35 1520 3844 0 4664 2014
4 699000 2 1.50 1400 4050 1.00 0 0 4 8 ... 1954 0 98115 47.68 -122.27 1900 5940 0 5450 2015

5 rows × 22 columns

Regression Model

In [30]:
print(data_city.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             21613 non-null  int64  
 1   room_bed          21613 non-null  int64  
 2   room_bath         21613 non-null  float64
 3   living_measure    21613 non-null  int64  
 4   lot_measure       21613 non-null  int64  
 5   ceil              21613 non-null  float64
 6   coast             21613 non-null  int64  
 7   sight             21613 non-null  int64  
 8   condition         21613 non-null  int64  
 9   quality           21613 non-null  int64  
 10  ceil_measure      21613 non-null  int64  
 11  basement          21613 non-null  int64  
 12  yr_built          21613 non-null  int64  
 13  yr_renovated      21613 non-null  int64  
 14  zipcode           21613 non-null  int64  
 15  lat               21613 non-null  float64
 16  long              21613 non-null  float64
 17  living_measure15  21613 non-null  int64  
 18  lot_measure15     21613 non-null  int64  
 19  furnished         21613 non-null  int64  
 20  total_area        21613 non-null  int64  
 21  Year              21613 non-null  int64  
dtypes: float64(4), int64(18)
memory usage: 3.6 MB
None
In [31]:
img=mpimg.imread('/content/drive/MyDrive/ML Capstone Project - 2022/Data/lake-forest-park-wa-5337270.gif')
data_city.plot(kind='scatter', x='long', y='lat', alpha=0.9,
				label='population', figsize=(25,20),c='price',s=data_city['living_measure']/100.0,cmap=plt.get_cmap('jet'),
			colorbar=True)


plt.imshow(img, extent=[-122.52, -121.31, 47.16, 47.78], alpha=0.2)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)
plt.legend() 
plt.show()

Below mentions the map from the latitude and longitue and mentions the price of the particular area when hovered around the particular point.

In [32]:
fig = px.scatter_mapbox(data_city, lat="lat", lon="long",                       
                        hover_data = ['price'],
                        color_discrete_sequence=["blue", "green"],
                        zoom=13,
                        height=800,
                        title  = 'Inner City Price Map')
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

Set up the X and Y from dataset

In [33]:
#splitting the data into X and Y
x = data_city.drop("price", axis=1)
y = data_city["price"]
print(x.shape)
print(y.shape)
(21613, 21)
(21613,)

As discussed following regression models can be used to evaluate the models.

  1. Linear
  2. Ridge
  3. Lasso
  4. Random Forest
  5. KNN
  6. XGBoost
In [34]:
#splitting the data in test and train data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 547)
In [35]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print (x_train['Year'].unique())
print(x_train.info())
print (x_test['Year'].unique())
print(x_test.info())
(16209, 21)
(5404, 21)
(16209,)
(5404,)
[2014 2015]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16209 entries, 6437 to 8303
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   room_bed          16209 non-null  int64  
 1   room_bath         16209 non-null  float64
 2   living_measure    16209 non-null  int64  
 3   lot_measure       16209 non-null  int64  
 4   ceil              16209 non-null  float64
 5   coast             16209 non-null  int64  
 6   sight             16209 non-null  int64  
 7   condition         16209 non-null  int64  
 8   quality           16209 non-null  int64  
 9   ceil_measure      16209 non-null  int64  
 10  basement          16209 non-null  int64  
 11  yr_built          16209 non-null  int64  
 12  yr_renovated      16209 non-null  int64  
 13  zipcode           16209 non-null  int64  
 14  lat               16209 non-null  float64
 15  long              16209 non-null  float64
 16  living_measure15  16209 non-null  int64  
 17  lot_measure15     16209 non-null  int64  
 18  furnished         16209 non-null  int64  
 19  total_area        16209 non-null  int64  
 20  Year              16209 non-null  int64  
dtypes: float64(4), int64(17)
memory usage: 2.7 MB
None
[2014 2015]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5404 entries, 1290 to 15517
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   room_bed          5404 non-null   int64  
 1   room_bath         5404 non-null   float64
 2   living_measure    5404 non-null   int64  
 3   lot_measure       5404 non-null   int64  
 4   ceil              5404 non-null   float64
 5   coast             5404 non-null   int64  
 6   sight             5404 non-null   int64  
 7   condition         5404 non-null   int64  
 8   quality           5404 non-null   int64  
 9   ceil_measure      5404 non-null   int64  
 10  basement          5404 non-null   int64  
 11  yr_built          5404 non-null   int64  
 12  yr_renovated      5404 non-null   int64  
 13  zipcode           5404 non-null   int64  
 14  lat               5404 non-null   float64
 15  long              5404 non-null   float64
 16  living_measure15  5404 non-null   int64  
 17  lot_measure15     5404 non-null   int64  
 18  furnished         5404 non-null   int64  
 19  total_area        5404 non-null   int64  
 20  Year              5404 non-null   int64  
dtypes: float64(4), int64(17)
memory usage: 928.8 KB
None

Regression models Evaluation

In [36]:
#creating the basic models 
lin_regression = LinearRegression()
lasso = Lasso()
ridge = Ridge()
randomForest = RandomForestRegressor()
knn = KNeighborsRegressor()
xgb = XGBRegressor()
gb=GradientBoostingRegressor( learning_rate = 0.1)
reg_models = [lin_regression,lasso,ridge,randomForest,knn,xgb,gb]
print(reg_models)
[LinearRegression(), Lasso(), Ridge(), RandomForestRegressor(), KNeighborsRegressor(), XGBRegressor(), GradientBoostingRegressor()]
In [37]:
#generating the scores 
model_output_phase1 = {}
for model in reg_models:
   
    mod_result = {}
    try:
        model.fit(x_train, y_train)      
        y_pred =  model.predict(x_test)
        mod_result['Train Accuracy'] =  model.score(x_train, y_train) * 100
        mod_result['Test Accuracy'] =  model.score(x_test, y_test) * 100
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae =mean_absolute_error(y_test, y_pred)
        mod_result['RMSE'] = np.sqrt(rmse)
        mod_result['Mean Absolute Error'] = mae
        mod_result['Predicted'] = y_pred
        model_output_phase1[str(model)] = mod_result
        print (str(model))        
    except Exception as e:
       print ('Exception for Model {0} : {1}'.format(str(model),e))    
y_pred = None
LinearRegression()
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_coordinate_descent.py:648: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.818e+14, tolerance: 2.240e+11

Lasso()
Ridge()
RandomForestRegressor()
KNeighborsRegressor()
[11:08:34] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor()
GradientBoostingRegressor()

Above method returns the value of the listed distributions from the list and the results as dictionary. FOllowing are the attributes in the results

  • Train Accuracy
  • Test Accuracy
  • RMSE
  • Mean Absolute Error
  • Y Predicted
In [38]:
for key , values in model_output_phase1.items():
   if key == 'Predicted':
     continue
   print()
   print(key)
   print(values)
LinearRegression()
{'Train Accuracy': 70.05329940387651, 'Test Accuracy': 70.58495973185084, 'RMSE': 437.99520666203034, 'Mean Absolute Error': 122930.76754197704, 'Predicted': array([185454.33613729, 465518.33613729, 146286.33613729, ...,
       656686.33613729, 161198.33613729, 404526.33613729])}

Lasso()
{'Train Accuracy': 70.05307353560586, 'Test Accuracy': 70.58264468250349, 'RMSE': 438.00382428232973, 'Mean Absolute Error': 122961.86385973057, 'Predicted': array([185347.12475184, 465511.39496383, 146340.95991906, ...,
       656887.54226743, 160974.11834866, 404772.29586333])}

Ridge()
{'Train Accuracy': 70.05286826471027, 'Test Accuracy': 70.57498923237516, 'RMSE': 438.03231757498713, 'Mean Absolute Error': 122944.13396779662, 'Predicted': array([185379.23601034, 465252.0181563 , 146291.10645954, ...,
       657160.97585708, 161151.6048288 , 405355.33599505])}

RandomForestRegressor()
{'Train Accuracy': 98.36593576985312, 'Test Accuracy': 87.51221709789763, 'RMSE': 353.5479560934401, 'Mean Absolute Error': 66754.70525682915, 'Predicted': array([259429.  , 364714.26, 343622.4 , ..., 466355.85, 253686.99,
       460177.01])}

KNeighborsRegressor()
{'Train Accuracy': 66.70609942101693, 'Test Accuracy': 48.36022900835394, 'RMSE': 504.1654409785823, 'Mean Absolute Error': 156970.3361954108, 'Predicted': array([428890., 503922., 303410., ..., 587769., 344150., 625000.])}

XGBRegressor()
{'Train Accuracy': 90.1484334308836, 'Test Accuracy': 86.7187476593104, 'RMSE': 359.0349716796548, 'Mean Absolute Error': 75344.7896986029, 'Predicted': array([252733.7 , 394670.16, 351136.94, ..., 515358.25, 251288.61,
       420238.38], dtype=float32)}

GradientBoostingRegressor()
{'Train Accuracy': 90.35037472923317, 'Test Accuracy': 86.89226228805995, 'RMSE': 357.8565177009843, 'Mean Absolute Error': 74931.64078910291, 'Predicted': array([253259.17713332, 394970.79688319, 348742.25843157, ...,
       499663.09385135, 253437.26369009, 419268.97930209])}

Regression Distributions of Actual Test vs Predicted (Joint plot)

Linear Regression

In [39]:
sns.jointplot(x=y_test, y=model_output_phase1['LinearRegression()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['LinearRegression()'].get('Test Accuracy'),model_output_phase1['LinearRegression()'].get('RMSE')))
Test Accuracy : 70.58495973185084 ,  RMSE : 437.99520666203034

Ridge

In [40]:
sns.jointplot(x=y_test, y=model_output_phase1['Ridge()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['Ridge()'].get('Test Accuracy'),model_output_phase1['Ridge()'].get('RMSE')))
Test Accuracy : 70.57498923237516 ,  RMSE : 438.03231757498713

Lasso

In [41]:
sns.jointplot(x=y_test, y=model_output_phase1['Lasso()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['Lasso()'].get('Test Accuracy'),model_output_phase1['Lasso()'].get('RMSE')))
Test Accuracy : 70.58264468250349 ,  RMSE : 438.00382428232973

Random Forest

In [42]:
sns.jointplot(x=y_test, y=model_output_phase1['RandomForestRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['RandomForestRegressor()'].get('Test Accuracy'),model_output_phase1['RandomForestRegressor()'].get('RMSE')))
Test Accuracy : 87.51221709789763 ,  RMSE : 353.5479560934401

KNN

In [43]:
sns.jointplot(x=y_test, y=model_output_phase1['KNeighborsRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['KNeighborsRegressor()'].get('Test Accuracy'),model_output_phase1['KNeighborsRegressor()'].get('RMSE')))
Test Accuracy : 48.36022900835394 ,  RMSE : 504.1654409785823

XGBoost

In [44]:
sns.jointplot(x=y_test, y=model_output_phase1['XGBRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['XGBRegressor()'].get('Test Accuracy'),model_output_phase1['XGBRegressor()'].get('RMSE')))
Test Accuracy : 86.7187476593104 ,  RMSE : 359.0349716796548

Gradient Boosting

In [45]:
sns.jointplot(x=y_test, y=model_output_phase1['GradientBoostingRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} ,  RMSE : {1}'.format(model_output_phase1['GradientBoostingRegressor()'].get('Test Accuracy'),model_output_phase1['GradientBoostingRegressor()'].get('RMSE')))
Test Accuracy : 86.89226228805995 ,  RMSE : 357.8565177009843

From the above graphs and the result, it can be inferred that the following models perform for the regression metrics compared to the other regression models used.

  1. Random Forest
  2. XG Boost

We will use the above models with hyper parameters, PCA, feature selection to score a better result.

In [46]:
# Transform the  X  variable  to zscores and  create the PCA dimensions.
data_pca = data_city.drop(['price'], axis = 1)
print (data_pca.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   room_bed          21613 non-null  int64  
 1   room_bath         21613 non-null  float64
 2   living_measure    21613 non-null  int64  
 3   lot_measure       21613 non-null  int64  
 4   ceil              21613 non-null  float64
 5   coast             21613 non-null  int64  
 6   sight             21613 non-null  int64  
 7   condition         21613 non-null  int64  
 8   quality           21613 non-null  int64  
 9   ceil_measure      21613 non-null  int64  
 10  basement          21613 non-null  int64  
 11  yr_built          21613 non-null  int64  
 12  yr_renovated      21613 non-null  int64  
 13  zipcode           21613 non-null  int64  
 14  lat               21613 non-null  float64
 15  long              21613 non-null  float64
 16  living_measure15  21613 non-null  int64  
 17  lot_measure15     21613 non-null  int64  
 18  furnished         21613 non-null  int64  
 19  total_area        21613 non-null  int64  
 20  Year              21613 non-null  int64  
dtypes: float64(4), int64(17)
memory usage: 3.5 MB
None
In [47]:
x_scaled = data_pca.apply(zscore)
x_scaled.head()
#scaled data
Out[47]:
room_bed room_bath living_measure lot_measure ceil coast sight condition quality ceil_measure ... yr_built yr_renovated zipcode lat long living_measure15 lot_measure15 furnished total_area Year
0 0.68 1.47 1.02 -0.04 -0.92 -0.09 -0.31 2.44 1.14 1.49 ... -0.51 -0.21 1.03 1.14 -0.87 0.19 -0.19 2.02 -0.02 -0.69
1 0.68 0.50 0.51 -0.18 -0.92 -0.09 -0.31 -0.63 0.29 -0.05 ... 0.17 -0.21 -1.03 -1.76 -1.22 0.40 -0.15 -0.49 -0.17 -0.69
2 -0.40 0.50 0.32 -0.26 0.94 -0.09 -0.31 -0.63 0.29 0.70 ... 1.19 -0.21 -0.75 -1.51 1.53 0.56 -0.31 -0.49 -0.25 1.45
3 -1.47 -1.45 -1.37 -0.27 -0.92 -0.09 -0.31 0.91 -1.41 -1.17 ... -1.87 -0.21 1.03 1.05 -0.96 -0.68 -0.33 -0.49 -0.30 -0.69
4 -1.47 -0.80 -0.74 -0.27 -0.92 -0.09 -0.31 0.91 0.29 -0.47 ... -0.58 -0.21 0.69 0.84 -0.39 -0.13 -0.25 -0.49 -0.28 1.45

5 rows × 21 columns

In [48]:
covMatrix = np.cov(x_scaled,rowvar=False)
print(covMatrix)
[[ 1.00004627e+00  5.15907508e-01  5.76697375e-01  3.17047099e-02
   1.75437052e-01 -6.58278324e-03  7.95355318e-02  2.84734219e-02
   3.56983242e-01  4.77622260e-01  3.03107400e-01  1.54185203e-01
   1.88416949e-02 -1.52675551e-01 -8.93142293e-03  1.29478966e-01
   3.91655645e-01  2.92455768e-02  2.59279861e-01  4.43118170e-02
  -9.83888879e-03]
 [ 5.15907508e-01  1.00004627e+00  7.54700198e-01  8.77437213e-02
   5.00676338e-01  6.37465786e-02  1.87745711e-01 -1.24987716e-01
   6.65013303e-01  6.85374187e-01  2.83783164e-01  5.06042852e-01
   5.07413254e-02 -2.03875707e-01  2.45740898e-02  2.23052163e-01
   5.68660601e-01  8.71793945e-02  4.84945275e-01  1.04054651e-01
  -2.65972150e-02]
 [ 5.76697375e-01  7.54700198e-01  1.00004627e+00  1.72833658e-01
   3.53965668e-01  1.03822621e-01  2.84624355e-01 -5.87553054e-02
   7.62739767e-01  8.76637159e-01  4.35063103e-01  3.18063485e-01
   5.53654885e-02 -1.99439271e-01  5.25318928e-02  2.40234413e-01
   7.56455259e-01  1.83294032e-01  6.32975918e-01  1.94217850e-01
  -2.90396848e-02]
 [ 3.17047099e-02  8.77437213e-02  1.72833658e-01  1.00004627e+00
  -5.20123153e-03  2.16046829e-02  7.47135624e-02 -8.95866401e-03
   1.13626381e-01  1.83520772e-01  1.52869089e-02  5.30828231e-02
   7.64385868e-03 -1.29580482e-01 -8.56867528e-02  2.29531479e-01
   1.44614865e-01  7.18590000e-01  1.18888623e-01  9.99809670e-01
   5.46868425e-03]
 [ 1.75437052e-01  5.00676338e-01  3.53965668e-01 -5.20123153e-03
   1.00004627e+00  2.36994168e-02  2.94451826e-02 -2.63780150e-01
   4.58203714e-01  5.23908951e-01 -2.45715911e-01  4.89342066e-01
   6.33869404e-03 -5.91233772e-02  4.96164267e-02  1.25424831e-01
   2.79898216e-01 -1.12697081e-02  3.47765171e-01  2.63672271e-03
  -2.23159338e-02]
 [-6.58278324e-03  6.37465786e-02  1.03822621e-01  2.16046829e-02
   2.36994168e-02  1.00004627e+00  4.01875945e-01  1.66539280e-02
   8.27787439e-02  7.20779266e-02  8.05916678e-02 -2.61622961e-02
   9.28891346e-02  3.02861289e-02 -1.42744361e-02 -4.19121393e-02
   8.64671368e-02  3.07047038e-02  6.98850257e-02  2.38099022e-02
  -4.16494752e-03]
 [ 7.95355318e-02  1.87745711e-01  2.84624355e-01  7.47135624e-02
   2.94451826e-02  4.01875945e-01  1.00004627e+00  4.59918646e-02
   2.51332213e-01  1.67657101e-01  2.76959393e-01 -5.34423241e-02
   1.03922096e-01  8.48308419e-02  6.15701695e-03 -7.84033399e-02
   2.80452058e-01  7.25779259e-02  2.20260582e-01  8.06962915e-02
   1.36387939e-03]
 [ 2.84734219e-02 -1.24987716e-01 -5.87553054e-02 -8.95866401e-03
  -2.63780150e-01  1.66539280e-02  4.59918646e-02  1.00004627e+00
  -1.44680365e-01 -1.58220937e-01  1.74112970e-01 -3.61433285e-01
  -6.06205914e-02  3.02566389e-03 -1.49416977e-02 -1.06505376e-01
  -9.28285631e-02 -3.40568056e-03 -1.21907768e-01 -1.02198873e-02
  -4.55915001e-02]
 [ 3.56983242e-01  6.65013303e-01  7.62739767e-01  1.13626381e-01
   4.58203714e-01  8.27787439e-02  2.51332213e-01 -1.44680365e-01
   1.00004627e+00  7.55957915e-01  1.68399616e-01  4.46983886e-01
   1.44149477e-02 -1.84870647e-01  1.14089336e-01  1.98381332e-01
   7.13235093e-01  1.19253415e-01  7.88657626e-01  1.30009946e-01
  -3.03882166e-02]
 [ 4.77622260e-01  6.85374187e-01  8.76637159e-01  1.83520772e-01
   5.23908951e-01  7.20779266e-02  1.67657101e-01 -1.58220937e-01
   7.55957915e-01  1.00004627e+00 -5.19457102e-02  4.23917966e-01
   2.32857653e-02 -2.61202062e-01 -8.16536350e-04  3.43818925e-01
   7.31904156e-01  1.94058841e-01  6.52413196e-01  2.02136316e-01
  -2.38239183e-02]
 [ 3.03107400e-01  2.83783164e-01  4.35063103e-01  1.52869089e-02
  -2.45715911e-01  8.05916678e-02  2.76959393e-01  1.74112970e-01
   1.68399616e-01 -5.19457102e-02  1.00004627e+00 -1.33130259e-01
   7.13262019e-02  7.48480709e-02  1.10543073e-01 -1.44771472e-01
   2.00364254e-01  1.72769799e-02  9.28508942e-02  2.48327519e-02
  -1.56874240e-02]
 [ 1.54185203e-01  5.06042852e-01  3.18063485e-01  5.30828231e-02
   4.89342066e-01 -2.61622961e-02 -5.34423241e-02 -3.61433285e-01
   4.46983886e-01  4.23917966e-01 -1.33130259e-01  1.00004627e+00
  -2.24883923e-01 -3.46885228e-01 -1.48129256e-01  4.09375144e-01
   3.26243994e-01  7.09612097e-02  3.05239289e-01  5.98916861e-02
   3.50748286e-03]
 [ 1.88416949e-02  5.07413254e-02  5.53654885e-02  7.64385868e-03
   6.33869404e-03  9.28891346e-02  1.03922096e-01 -6.06205914e-02
   1.44149477e-02  2.32857653e-02  7.13262019e-02 -2.24883923e-01
   1.00004627e+00  6.43600351e-02  2.93989695e-02 -6.83755323e-02
  -2.67267911e-03  7.85412844e-03  1.72126570e-02  8.83555142e-03
  -2.37078865e-02]
 [-1.52675551e-01 -2.03875707e-01 -1.99439271e-01 -1.29580482e-01
  -5.91233772e-02  3.02861289e-02  8.48308419e-02  3.02566389e-03
  -1.84870647e-01 -2.61202062e-01  7.48480709e-02 -3.46885228e-01
   6.43600351e-02  1.00004627e+00  2.67060306e-01 -5.64097706e-01
  -2.79045908e-01 -1.47227881e-01 -1.38802710e-01 -1.33459621e-01
   1.18949364e-03]
 [-8.93142293e-03  2.45740898e-02  5.25318928e-02 -8.56867528e-02
   4.96164267e-02 -1.42744361e-02  6.15701695e-03 -1.49416977e-02
   1.14089336e-01 -8.16536350e-04  1.10543073e-01 -1.48129256e-01
   2.93989695e-02  2.67060306e-01  1.00004627e+00 -1.35518054e-01
   4.88601928e-02 -8.64228058e-02  8.09557979e-02 -8.41793469e-02
  -2.92137936e-02]
 [ 1.29478966e-01  2.23052163e-01  2.40234413e-01  2.29531479e-01
   1.25424831e-01 -4.19121393e-02 -7.84033399e-02 -1.06505376e-01
   1.98381332e-01  3.43818925e-01 -1.44771472e-01  4.09375144e-01
  -6.83755323e-02 -5.64097706e-01 -1.35518054e-01  1.00004627e+00
   3.34620466e-01  2.54463061e-01  1.87527620e-01  2.33906403e-01
   2.69756012e-04]
 [ 3.91655645e-01  5.68660601e-01  7.56455259e-01  1.44614865e-01
   2.79898216e-01  8.64671368e-02  2.80452058e-01 -9.28285631e-02
   7.13235093e-01  7.31904156e-01  2.00364254e-01  3.26243994e-01
  -2.67267911e-03 -2.79045908e-01  4.88601928e-02  3.34620466e-01
   1.00004627e+00  1.83200225e-01  6.20163287e-01  1.60734042e-01
  -2.17351045e-02]
 [ 2.92455768e-02  8.71793945e-02  1.83294032e-01  7.18590000e-01
  -1.12697081e-02  3.07047038e-02  7.25779259e-02 -3.40568056e-03
   1.19253415e-01  1.94058841e-01  1.72769799e-02  7.09612097e-02
   7.85412844e-03 -1.47227881e-01 -8.64228058e-02  2.54463061e-01
   1.83200225e-01  1.00004627e+00  1.29349988e-01  7.19725208e-01
  -8.49481659e-05]
 [ 2.59279861e-01  4.84945275e-01  6.32975918e-01  1.18888623e-01
   3.47765171e-01  6.98850257e-02  2.20260582e-01 -1.21907768e-01
   7.88657626e-01  6.52413196e-01  9.28508942e-02  3.05239289e-01
   1.72126570e-02 -1.38802710e-01  8.09557979e-02  1.87527620e-01
   6.20163287e-01  1.29349988e-01  1.00004627e+00  1.32385194e-01
  -2.51122351e-02]
 [ 4.43118170e-02  1.04054651e-01  1.94217850e-01  9.99809670e-01
   2.63672271e-03  2.38099022e-02  8.06962915e-02 -1.02198873e-02
   1.30009946e-01  2.02136316e-01  2.48327519e-02  5.98916861e-02
   8.83555142e-03 -1.33459621e-01 -8.41793469e-02  2.33906403e-01
   1.60734042e-01  7.19725208e-01  1.32385194e-01  1.00004627e+00
   4.80521471e-03]
 [-9.83888879e-03 -2.65972150e-02 -2.90396848e-02  5.46868425e-03
  -2.23159338e-02 -4.16494752e-03  1.36387939e-03 -4.55915001e-02
  -3.03882166e-02 -2.38239183e-02 -1.56874240e-02  3.50748286e-03
  -2.37078865e-02  1.18949364e-03 -2.92137936e-02  2.69756012e-04
  -2.17351045e-02 -8.49481659e-05 -2.51122351e-02  4.80521471e-03
   1.00004627e+00]]
In [49]:
pca = PCA()
pca.fit(x_scaled)
Out[49]:
PCA()
In [50]:
print ('Eigen Values')
print (pca.explained_variance_)
Eigen Values
[5.81735819e+00 2.63217938e+00 2.16496525e+00 1.43261918e+00
 1.24538794e+00 1.01909127e+00 1.00635383e+00 9.13826682e-01
 8.37035171e-01 7.43851870e-01 6.00527180e-01 5.28716294e-01
 4.14875186e-01 3.69521441e-01 3.54663788e-01 3.12433762e-01
 2.54644826e-01 2.00977220e-01 1.51943211e-01 3.58864846e-31
 1.71707637e-32]
In [51]:
print('Eigen Vectors')
print(pca.explained_variance_ratio_)
Eigen Vectors
[2.77004240e-01 1.25336076e-01 1.03088813e-01 6.82168046e-02
 5.93014437e-02 4.85259105e-02 4.79193936e-02 4.35135429e-02
 3.98569735e-02 3.54198787e-02 2.85952093e-02 2.51758015e-02
 1.97550472e-02 1.75954450e-02 1.68879704e-02 1.48771098e-02
 1.21253830e-02 9.56990100e-03 7.23505623e-03 1.70880115e-32
 8.17617581e-34]
In [52]:
print(len(pca.explained_variance_ratio_))
21
In [53]:
tot = sum(pca.explained_variance_)
var_explained = [(i / tot) for i in sorted(pca.explained_variance_, reverse=True)]  

# an array of variance explained by each 

# eigen vector... there will be 90 entries as there are 90 eigen vectors)
cum_var_exp = np.cumsum(var_explained)  

print(len(var_explained))
print(len(cum_var_exp))
print(cum_var_exp)
21
21
[0.27700424 0.40234032 0.50542913 0.57364593 0.63294738 0.68147329
 0.72939268 0.77290622 0.8127632  0.84818308 0.87677829 0.90195409
 0.92170913 0.93930458 0.95619255 0.97106966 0.98319504 0.99276494
 1.         1.         1.        ]

It can be inferred that 85% of th variance are contributed by 18 features

Ensemble Feature Selection

Feature Selection for Random Forest

In [54]:
plt.figure(figsize=(15, 10))
plt.bar(range(0,21), var_explained, alpha = 0.5, align='center', label='individual explained variance')
plt.step(range(0,21), cum_var_exp,  label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
In [55]:
#feature importance
rf_feature=pd.DataFrame(randomForest.feature_importances_, columns = ["Imp"], index = x_train.columns)
rf_feature.sort_values(by="Imp",ascending=False)
rf_feature['Imp'] = rf_feature['Imp'].map('{0:.5f}'.format)
rf_feature=rf_feature.sort_values(by="Imp",ascending=False)
rf_feature.Imp=rf_feature.Imp.astype("float")
rf_feature[:21].plot.bar(figsize=(15, 10))

print (rf_feature[:21])
print("First 7 feature importance:\t",(rf_feature[:7].sum())*100)
print("First 10 feature importance:\t",(rf_feature[:10].sum())*100)
print("First 12 feature importance:\t",(rf_feature[:12].sum())*100)
print("First 13 feature importance:\t",(rf_feature[:13].sum())*100)
print("First 15 feature importance:\t",(rf_feature[:15].sum())*100)
print("First 20 feature importance:\t",(rf_feature[:20].sum())*100)
                  Imp
living_measure   0.29
quality          0.21
lat              0.16
furnished        0.09
long             0.06
living_measure15 0.04
coast            0.03
yr_built         0.03
ceil_measure     0.02
zipcode          0.01
lot_measure15    0.01
sight            0.01
total_area       0.01
lot_measure      0.01
room_bath        0.01
basement         0.01
room_bed         0.00
condition        0.00
yr_renovated     0.00
Year             0.00
ceil             0.00
First 7 feature importance:	 Imp   88.24
dtype: float64
First 10 feature importance:	 Imp   93.78
dtype: float64
First 12 feature importance:	 Imp   96.05
dtype: float64
First 13 feature importance:	 Imp   96.95
dtype: float64
First 15 feature importance:	 Imp   98.29
dtype: float64
First 20 feature importance:	 Imp   99.81
dtype: float64

Feature Selection for XGB

In [56]:
#feature importance for XGB
xgb_feature=pd.DataFrame(xgb.feature_importances_, columns = ["Imp"], index = x_train.columns)
xgb_feature.sort_values(by="Imp",ascending=False)
xgb_feature['Imp'] = xgb_feature['Imp'].map('{0:.5f}'.format)
xgb_feature=xgb_feature.sort_values(by="Imp",ascending=False)
xgb_feature.Imp=xgb_feature.Imp.astype("float")
xgb_feature[:21].plot.bar(figsize=(15, 10))

print (xgb_feature[:21])
#First 20 features have an importance of 90.5% and first 30 have importance of 95.15
print("First 7 feature importance:\t",(xgb_feature[:7].sum())*100)
print("First 10 feature importance:\t",(xgb_feature[:10].sum())*100)
print("First 12 feature importance:\t",(xgb_feature[:12].sum())*100)
print("First 13 feature importance:\t",(xgb_feature[:13].sum())*100)
print("First 15 feature importance:\t",(xgb_feature[:15].sum())*100)
print("First 20 feature importance:\t",(xgb_feature[:20].sum())*100)
                  Imp
quality          0.36
living_measure   0.19
coast            0.07
sight            0.06
long             0.06
lat              0.06
yr_built         0.05
living_measure15 0.04
ceil_measure     0.02
zipcode          0.02
room_bath        0.01
yr_renovated     0.01
condition        0.01
total_area       0.01
lot_measure15    0.01
Year             0.01
lot_measure      0.00
basement         0.00
ceil             0.00
room_bed         0.00
furnished        0.00
First 7 feature importance:	 Imp   85.50
dtype: float64
First 10 feature importance:	 Imp   93.62
dtype: float64
First 12 feature importance:	 Imp   95.52
dtype: float64
First 13 feature importance:	 Imp   96.42
dtype: float64
First 15 feature importance:	 Imp   97.97
dtype: float64
First 20 feature importance:	 Imp   100.00
dtype: float64

From above we could see that the first 12 features in both Random Forest and XGB equates to 96%.

XGB - 95.94

Random Forest: 96.02

In [57]:
feature_xgb = [ 'quality','living_measure','coast',  'sight',          'long',          'lat',           'yr_built',
'living_measure15','ceil_measure','zipcode',   'room_bath','yr_renovated']      

feature_rf = ['living_measure','quality','lat','long','furnished','living_measure15','coast','yr_built','ceil_measure',
'zipcode','sight','lot_measure15']

features = set(feature_xgb).union(feature_rf)

features = list(features)
features.append('price')
print(features)
['room_bath', 'furnished', 'quality', 'yr_built', 'sight', 'yr_renovated', 'lat', 'coast', 'ceil_measure', 'lot_measure15', 'long', 'living_measure15', 'living_measure', 'zipcode', 'price']
In [58]:
data_pca = data_city[features]
print (data_pca.head())
   room_bath  furnished  quality  yr_built  sight  yr_renovated   lat  coast  \
0       3.25          1        9      1956      0             0 47.72      0   
1       2.50          0        8      1976      0             0 47.32      0   
2       2.50          0        8      2006      0             0 47.35      0   
3       1.00          0        6      1916      0             0 47.70      0   
4       1.50          0        8      1954      0             0 47.68      0   

   ceil_measure  lot_measure15    long  living_measure15  living_measure  \
0          3020           7553 -122.34              2120            3020   
1          1750           8800 -122.39              2260            2550   
2          2370           4348 -122.00              2370            2370   
3           820           3844 -122.35              1520             820   
4          1400           5940 -122.27              1900            1400   

   zipcode   price  
0    98133  808100  
1    98023  277500  
2    98038  404000  
3    98133  300000  
4    98115  699000  

With the reduced features, we will proceed with the Grid Search operation for feature importance

In [59]:
xgs = data_pca.drop("price" , axis=1)
ygs = data_pca["price"]

#xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)

print(xgs_train.shape)
print(xgs_test.shape)

print(ygs_train.shape)
print(ygs_test.shape)
(17290, 14)
(4323, 14)
(17290,)
(4323,)

Lets Create a new Dataframe to hold all the results which are needed. This is just a place holder. Initializing a dataframe simply and then adding them one by one is a costly affair in terms of memory. We will create them on the go with the below variable.

In [63]:
pd_reg_search_results  = None

Hyper Tuning the paramters for Random Forest

In [64]:
RF_ht = RandomForestRegressor()

params_rf = {"n_estimators": np.arange(70,90,1),"max_depth": np.arange(12,20,1),
          "max_features":np.arange(5,10,1),'min_samples_leaf': range(4, 8, 1),
    'min_samples_split': range(16, 20, 1)}

rf_random = RandomizedSearchCV(estimator = RF_ht, param_distributions = params_rf,
               n_iter = 5, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Out[64]:
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': array([12, 13, 14, 15, 16, 17, 18, 19]),
                                        'max_features': array([5, 6, 7, 8, 9]),
                                        'min_samples_leaf': range(4, 8),
                                        'min_samples_split': range(16, 20),
                                        'n_estimators': array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
       87, 88, 89])},
                   random_state=35, verbose=2)
In [65]:
print (rf_random.best_params_)
print (rf_random.best_score_)
{'n_estimators': 85, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 9, 'max_depth': 12}
0.8601633999721257
In [66]:
params_rf = {'min_samples_leaf': range(3, 9, 1),
    }

rf_random = RandomizedSearchCV(estimator = RF_ht, param_distributions = params_rf,
               n_iter = 5, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Out[66]:
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'min_samples_leaf': range(3, 9)},
                   random_state=35, verbose=2)
In [67]:
print (rf_random.best_params_)
print (rf_random.best_score_)
{'min_samples_leaf': 3}
0.8720355563713097
In [68]:
params_rf = {"max_depth": np.arange(12,20,1),
    }

rf_random2 = RandomizedSearchCV(estimator = RF_ht, param_distributions = params_rf,
               n_iter = 5, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random2.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Out[68]:
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': array([12, 13, 14, 15, 16, 17, 18, 19])},
                   random_state=35, verbose=2)
In [69]:
print (rf_random2.best_params_)
print (rf_random2.best_score_)
{'max_depth': 17}
0.8772115330848544
In [70]:
'''
Can we make this as a separate function as this will be repeated in further models. Added here for better readability.
Based on Tomorrow's discussion we can modify and fine tune again for better readability
'''
def calc_RandomForestRegressor(x_train, x_test, y_train, y_test,n_estimators= 0, min_samples_split= 0,
                               min_samples_leaf= 0, max_features= 0, max_depth= 0, name  = ''):
  
  rfg = RandomForestRegressor(n_estimators= n_estimators, min_samples_split= min_samples_split, min_samples_leaf= min_samples_leaf,
                              max_features= max_features, max_depth= max_depth)
  
  '''
  print(x_train.shape)
  print(x_test.shape)
  print(y_train.shape)
  print(y_test.shape)
  '''
  rfg.fit(x_train, y_train)      
  y_pred_rfg =  rfg.predict(x_test)
  train_score = rfg.score(x_train, y_train) * 100
  test_score = rfg.score(x_test, y_test) * 100
  rmse = mean_squared_error(y_test, y_pred_rfg, squared=False)
  mae =mean_absolute_error(y_test, y_pred_rfg)
  rmse_val =  np.sqrt(rmse)  
  r2_score_val = r2_score(y_test, y_pred_rfg)
  r2_score_variance_weighted = r2_score(y_test, y_pred_rfg, multioutput='variance_weighted')
  print('Train Score: {0}%'.format(train_score))
  print('Test Score: {0}%'.format(test_score))  
  print('RMSE:{0}'.format(rmse_val))
  print( 'Mean Absolute Error:{0}'.format(mae))  
  print ('R Square: {0}'.format(r2_score_val))
  print ('R Square Variance Weighted : {0}'.format(r2_score_val))
  return {'Method':name,'Train Square':train_score,'Test Score':test_score,'RMSE':rmse_val,
            'MAE':mae,'R Square':r2_score_val,'R Square Variance Weighted':r2_score_variance_weighted,
            'Y Pred':y_pred_rfg}
In [71]:
rf_result_random = calc_RandomForestRegressor(xgs_train, xgs_test, ygs_train, ygs_test,
                                              n_estimators= 83, min_samples_split=19, min_samples_leaf= 4,
                                              max_features= 8, max_depth= 15, name = 'RF Train')
print(rf_result_random)
Train Score: 92.05038088508206%
Test Score: 87.02615024903054%
RMSE:358.88297799102526
Mean Absolute Error:71362.7158731428
R Square: 0.8702615024903054
R Square Variance Weighted : 0.8702615024903054
{'Method': 'RF Train', 'Train Square': 92.05038088508206, 'Test Score': 87.02615024903054, 'RMSE': 358.88297799102526, 'MAE': 71362.7158731428, 'R Square': 0.8702615024903054, 'R Square Variance Weighted': 0.8702615024903054, 'Y Pred': array([ 433626.63866863,  455466.69526035,  495623.50904675, ...,
        355794.26535429, 1239651.57571414,  324044.20702602])}

Creating the result Dataframe to add the rows

In [72]:
pd_reg_search_results = pd.DataFrame({'Method':rf_result_random.get('Method') ,
                                      'Name':'Random Forest (estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)',
                                      'Train Score (%)':rf_result_random.get('Train Square'),
                                      'Test Score (%)':rf_result_random.get('Test Score'),
                                      'R Square':rf_result_random.get('R Square'),                                      
                                      },index = [0]
                                     )
In [73]:
pd_reg_search_results
Out[73]:
Method Name Train Score (%) Test Score (%) R Square
0 RF Train Random Forest (estimators:83, max_depth:15, ma... 92.05 87.03 0.87
In [74]:
print (rf_feature[:21])
print("First 7 feature importance:\t",(rf_feature[:7].sum())*100)
print("First 10 feature importance:\t",(rf_feature[:10].sum())*100)
print("First 12 feature importance:\t",(rf_feature[:12].sum())*100)
                  Imp
living_measure   0.29
quality          0.21
lat              0.16
furnished        0.09
long             0.06
living_measure15 0.04
coast            0.03
yr_built         0.03
ceil_measure     0.02
zipcode          0.01
lot_measure15    0.01
sight            0.01
total_area       0.01
lot_measure      0.01
room_bath        0.01
basement         0.01
room_bed         0.00
condition        0.00
yr_renovated     0.00
Year             0.00
ceil             0.00
First 7 feature importance:	 Imp   88.24
dtype: float64
First 10 feature importance:	 Imp   93.78
dtype: float64
First 12 feature importance:	 Imp   96.05
dtype: float64

Taking only the top 10 columns to see if that helps in the performance of the Random Forest REgressor. Also the columns have very less weightage.

In [75]:
data_pca1 = data_city[features]
data_pca1.drop(["sight","lot_measure15"] ,inplace = True,axis = 1)
print(data_pca1.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   room_bath         21613 non-null  float64
 1   furnished         21613 non-null  int64  
 2   quality           21613 non-null  int64  
 3   yr_built          21613 non-null  int64  
 4   yr_renovated      21613 non-null  int64  
 5   lat               21613 non-null  float64
 6   coast             21613 non-null  int64  
 7   ceil_measure      21613 non-null  int64  
 8   long              21613 non-null  float64
 9   living_measure15  21613 non-null  int64  
 10  living_measure    21613 non-null  int64  
 11  zipcode           21613 non-null  int64  
 12  price             21613 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 2.1 MB
None
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:4913: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [76]:
xgs1 = data_pca1.drop("price" , axis=1)
ygs1 = data_pca1["price"]

#xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
xgs_train1, xgs_test1, ygs_train1, ygs_test1 = train_test_split(xgs1, ygs1, test_size=0.2, random_state=10)

print(xgs_train1.shape)
print(xgs_test1.shape)

print(ygs_train1.shape)
print(ygs_test1.shape)
(17290, 12)
(4323, 12)
(17290,)
(4323,)
In [77]:
rf_res_reduced_train = calc_RandomForestRegressor(xgs_train1, xgs_test1, ygs_train1, ygs_test1,n_estimators= 83,
                                                  min_samples_split=19, min_samples_leaf= 4, max_features= 8,
                                                  max_depth= 15, name = 'RF Train2')
Train Score: 91.99712269984065%
Test Score: 86.90472259565439%
RMSE:359.71978044482813
Mean Absolute Error:73211.79662146665
R Square: 0.8690472259565439
R Square Variance Weighted : 0.8690472259565439
In [78]:
pd_reg_search_results
Out[78]:
Method Name Train Score (%) Test Score (%) R Square
0 RF Train Random Forest (estimators:83, max_depth:15, ma... 92.05 87.03 0.87
In [79]:
df2 = {'Method':rf_res_reduced_train.get('Method') ,
                                      'Name':'Features Removed Random Forest (estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)',
                                      'Train Score (%)':rf_result_random.get('Train Square'),
                                      'Test Score (%)':rf_res_reduced_train.get('Test Score'),
                                      'R Square':rf_res_reduced_train.get('R Square'),
                                      }
                                    

pd_reg_search_results = pd_reg_search_results.append(df2,ignore_index=True, sort = False)

From the reduced train it can be inferred that the columns living measure, lot measure and ceil measure are multi colliear and related to each other. Lets try dropping the repeated columns and see the improvement from 87%.

Following are the columns which can be dropped.

  • living_measure15
  • ceil_measure
In [80]:
data_pca2 = data_pca1
data_pca2.drop(["living_measure15","ceil_measure"] ,inplace = True,axis = 1)
print(data_pca2.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   room_bath       21613 non-null  float64
 1   furnished       21613 non-null  int64  
 2   quality         21613 non-null  int64  
 3   yr_built        21613 non-null  int64  
 4   yr_renovated    21613 non-null  int64  
 5   lat             21613 non-null  float64
 6   coast           21613 non-null  int64  
 7   long            21613 non-null  float64
 8   living_measure  21613 non-null  int64  
 9   zipcode         21613 non-null  int64  
 10  price           21613 non-null  int64  
dtypes: float64(3), int64(8)
memory usage: 1.8 MB
None
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:4913: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [81]:
xgs2 = data_pca2.drop("price" , axis=1)
ygs2 = data_pca2["price"]

#xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
xgs_train2, xgs_test2, ygs_train2, ygs_test2 = train_test_split(xgs2, ygs2, test_size=0.2, random_state=10)

print(xgs_train2.shape)
print(xgs_test2.shape)

print(ygs_train2.shape)
print(ygs_test2.shape)
(17290, 10)
(4323, 10)
(17290,)
(4323,)
In [82]:
rf_res_reduced_train1 = calc_RandomForestRegressor(xgs_train2, xgs_test2, ygs_train2, ygs_test2,n_estimators= 83,
                                                  min_samples_split=19, min_samples_leaf= 3, max_features= 8,
                                                  max_depth= 15, name = 'RF Train3')
Train Score: 91.86644015957037%
Test Score: 86.74216468572075%
RMSE:360.8309643219248
Mean Absolute Error:72436.38222645145
R Square: 0.8674216468572076
R Square Variance Weighted : 0.8674216468572076
In [83]:
df3 = {'Method':rf_res_reduced_train1.get('Method') ,
                                      'Train Score (%)':rf_res_reduced_train1.get('Train Square'),
                                      'Name':'Multi collinear removed frmo Random Forest(estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)',
                                      'Test Score (%)':rf_res_reduced_train1.get('Test Score'),
                                      'R Square':rf_res_reduced_train1.get('R Square')
                                      }
                                    

pd_reg_search_results = pd_reg_search_results.append(df3,ignore_index=True, sort = False)
In [84]:
pd_reg_search_results.iloc[0,1]
Out[84]:
'Random Forest (estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)'

From the above , we tried Random Forest HP turning and removed some columns post tuning as well. The Best Score we could gain here are the following Train Score: 91.98782014869201% Test Score: 87.1806477469115% RMSE:357.8097448779679

XGB Regressor

In [85]:
xgb = XGBRegressor(learning_rate=0.02, n_estimators=600,
                    silent=False, nthread=1)
params_xgb = {
        'gamma': [0.1,0.5, 1, 1.5, 2, 5],                
        'max_depth': [2,3, 4, 5],
        'max_features':[8,9,10,11],
        'min_samples_leaf':[6,7,8],
        "n_estimators": [50,75,100]
        }

xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb,
               n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
xgb_random.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
/usr/local/lib/python3.7/dist-packages/joblib/externals/loky/process_executor.py:705: UserWarning:

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.

[11:20:31] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[85]:
RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(learning_rate=0.02, n_estimators=600,
                                          nthread=1, silent=False),
                   n_iter=6, n_jobs=5,
                   param_distributions={'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
                                        'max_depth': [2, 3, 4, 5],
                                        'max_features': [8, 9, 10, 11],
                                        'min_samples_leaf': [6, 7, 8],
                                        'n_estimators': [50, 75, 100]},
                   random_state=35, verbose=1)
In [86]:
print (xgb_random.best_params_)
print (xgb_random.best_score_)
{'n_estimators': 100, 'min_samples_leaf': 6, 'max_features': 11, 'max_depth': 5, 'gamma': 2}
0.7734947769671832
In [87]:
def calc_xgb(x_train, x_test, y_train, y_test,learning_rate=0, n_estimators=0,silent=False, nthread=0,
             min_samples_leaf =  0, max_features =  0, max_depth =  0, gamma =   0, booster = 'gbtree', name = ''):
    xgb = XGBRegressor(learning_rate=learning_rate, n_estimators=n_estimators,silent=silent, nthread=nthread,min_samples_leaf = min_samples_leaf, 
                       max_features =  max_features, max_depth =  max_depth, gamma = gamma, booster  = booster)
    '''
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    '''
    xgb.fit(x_train, y_train)      
    y_pred_xgb =  xgb.predict(x_test)
    train_score =  xgb.score(x_train, y_train) * 100
    test_score = xgb.score(x_test, y_test) * 100
    rmse = mean_squared_error(y_test, y_pred_xgb, squared=False)
    rmse_val = np.sqrt(rmse)
    mae =mean_absolute_error(y_test, y_pred_xgb)
    r2_score_val = r2_score(y_test, y_pred_xgb)
    r2_score_variance_weighted = r2_score(y_test, y_pred_xgb, multioutput='variance_weighted')
    print('Train Score: {0}%'.format(train_score))
    print('Test Score: {0}%'.format( test_score))
    print('RMSE:{0}'.format(rmse_val))
    print('MAE:{0}'.format(mae))
    print('R Square : {0}'.format(r2_score_val)* 100)
    print('R Square Variance Weighted  :{0}'.format(r2_score_variance_weighted)*100)
    return {'Method':name,'Train Square':train_score,'Test Score':test_score,'RMSE':rmse_val,
            'MAE':mae,'R Square':r2_score_val,'R Square Variance Weighted':r2_score_variance_weighted,
            'Y Pred':y_pred_xgb}
In [88]:
xgb_rand_res  = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.02, n_estimators=100,silent=False, nthread=0,
             min_samples_leaf =  8, max_features =  11, max_depth =  5, gamma =   0.5, name = 'XGB Random')
[11:20:54] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 81.288682964449%
Test Score: 77.64800890548902%
RMSE:411.16363596296964
MAE:98020.77984761739
R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902R Square : 0.7764800890548902
R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902R Square Variance Weighted  :0.7764800890548902
In [89]:
df4 = {'Method':xgb_rand_res.get('Method') ,
                                       'Name':'XG Boost (estimators:100, Learning Rate:0.02, max_features:11,min_samples_leaf:8)',
                                      'Train Score (%)':xgb_rand_res.get('Train Square'),
                                      'Test Score (%)':xgb_rand_res.get('Test Score'),
                                      'R Square':xgb_rand_res.get('R Square'),
                                      'R Square Variance Weighted':xgb_rand_res.get('R Square Variance Weighted')}
                                    

pd_reg_search_results = pd_reg_search_results.append(df3,ignore_index = True)
In [90]:
params_xgb1 = {        
        "n_estimators": [100, 150, 200,250,350,400]
        }

rf_xgb1 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb1,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb1.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[11:21:43] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[90]:
RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(learning_rate=0.02, n_estimators=600,
                                          nthread=1, silent=False),
                   n_iter=6, n_jobs=5,
                   param_distributions={'n_estimators': [100, 150, 200, 250,
                                                         350, 400]},
                   random_state=35, verbose=1)
In [91]:
print (rf_xgb1.best_params_)
print (rf_xgb1.best_score_)
{'n_estimators': 400}
0.8588816097761566
In [92]:
params_xgb1 = {        
        'min_samples_leaf': [5,6,7]
        }

rf_xgb2 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb1,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb2.fit(xgs_train,ygs_train)
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py:296: UserWarning:

The total space of parameters 3 is smaller than n_iter=6. Running 3 iterations. For exhaustive searches, use GridSearchCV.

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[11:22:40] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[92]:
RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(learning_rate=0.02, n_estimators=600,
                                          nthread=1, silent=False),
                   n_iter=6, n_jobs=5,
                   param_distributions={'min_samples_leaf': [5, 6, 7]},
                   random_state=35, verbose=1)
In [93]:
print (rf_xgb2.best_params_)
print (rf_xgb2.best_score_)
{'min_samples_leaf': 5}
0.8685954407912085
In [94]:
params_xgb3 = {        
        'max_features': [7,8,9,10,11,12]
        }
        
rf_xgb3 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb3,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb3.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[11:24:27] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[94]:
RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(learning_rate=0.02, n_estimators=600,
                                          nthread=1, silent=False),
                   n_iter=6, n_jobs=5,
                   param_distributions={'max_features': [7, 8, 9, 10, 11, 12]},
                   random_state=35, verbose=1)
In [95]:
print (rf_xgb3.best_params_)
print (rf_xgb3.best_score_)
{'max_features': 7}
0.8685954407912085
In [96]:
params_xgb4 = {        
        'max_depth': [5,6,8,10]
        }
        
rf_xgb4 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb4,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb4.fit(xgs_train,ygs_train)
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py:296: UserWarning:

The total space of parameters 4 is smaller than n_iter=6. Running 4 iterations. For exhaustive searches, use GridSearchCV.

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[11:27:45] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[96]:
RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(learning_rate=0.02, n_estimators=600,
                                          nthread=1, silent=False),
                   n_iter=6, n_jobs=5,
                   param_distributions={'max_depth': [5, 6, 8, 10]},
                   random_state=35, verbose=1)
In [97]:
print (rf_xgb4.best_params_)
print (rf_xgb4.best_score_)
{'max_depth': 5}
0.8809672059627932
In [98]:
params_xgb5 = {        
        'gamma': [0.1, 0.5, 1, 1.5, 2, 2.5,5]
        }
        
rf_xgb5 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb5,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb5.fit(xgs_train,ygs_train)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[11:30:07] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[98]:
RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(learning_rate=0.02, n_estimators=600,
                                          nthread=1, silent=False),
                   n_iter=6, n_jobs=5,
                   param_distributions={'gamma': [0.1, 0.5, 1, 1.5, 2, 2.5, 5]},
                   random_state=35, verbose=1)
In [99]:
print (rf_xgb5.best_params_)
print (rf_xgb5.best_score_)
{'gamma': 1}
0.8685954407912085
In [100]:
#learning_rate=0.025
xgb_tuned_res1 = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.05, n_estimators=200,silent=False, nthread=0,
             min_samples_leaf =  3, max_features =  7, max_depth =  6, gamma =   1             ,name = 'XGB Tuned 1')
[11:30:25] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 95.24626655175548%
Test Score: 90.2081275487969%
RMSE:334.504589593429
MAE:67923.5902729586
R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969R Square : 0.902081275487969
R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969R Square Variance Weighted  :0.902081275487969
In [101]:
df5 = {'Method':xgb_tuned_res1.get('Method') ,
                                      'Name':'XG Boost (estimators:200, Learning Rate:0.05, max_features:7,min_samples_leaf:3)',
                                      'Train Score (%)':xgb_tuned_res1.get('Train Square'),
                                      'Test Score (%)':xgb_tuned_res1.get('Test Score'),
                                      'R Square':xgb_tuned_res1.get('R Square'),
                                      }
                                    

pd_reg_search_results = pd_reg_search_results.append(df5,ignore_index = True)
In [102]:
xgb_tuned_res2 = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.02, n_estimators=200,silent=False, nthread=0,
             min_samples_leaf =  5, max_features = 7 , max_depth =  6, gamma =   2, name = 'XGB Tuned 2')
[11:30:36] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 93.10966170708372%
Test Score: 88.72897022799388%
RMSE:346.4787015824462
MAE:71984.54871833796
R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389R Square : 0.8872897022799389
R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389R Square Variance Weighted  :0.8872897022799389
In [103]:
df6 = {'Method':xgb_tuned_res2.get('Method') ,
                                      'Name':'XG Boost (estimators:200, Learning Rate:0.02, max_features:7,min_samples_leaf:5)',
                                      'Train Score (%)':xgb_tuned_res2.get('Train Square'),
                                      'Test Score (%)':xgb_tuned_res2.get('Test Score'),
                                      'R Square':xgb_tuned_res2.get('R Square'),
                                      }
                                    

pd_reg_search_results = pd_reg_search_results.append(df6,ignore_index = True)

Lets try with reduced feature with 2 columns removed

In [104]:
xgb_tuned_res3 = calc_xgb(xgs_train1, xgs_test1, ygs_train1, ygs_test1,learning_rate=0.02, n_estimators=200,silent=False, nthread=0,
             min_samples_leaf =  5, max_features =  11, max_depth =  6, gamma =   2, name = 'XGB Tuned 3')
[11:30:45] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 92.29534664009536%
Test Score: 88.20349094649166%
RMSE:350.44835061549594
MAE:74110.12492048346
R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166R Square : 0.8820349094649166
R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166R Square Variance Weighted  :0.8820349094649166
In [105]:
df7 = {'Method':xgb_tuned_res3.get('Method') ,
                                      'Name':'Reduced Columns XG Boost (estimators:200, Learning Rate:0.02, max_features:11,min_samples_leaf:5)',
                                      'Train Score (%)':xgb_tuned_res3.get('Train Square'),
                                      'Test Score (%)':xgb_tuned_res3.get('Test Score'),
                                      'R Square':xgb_tuned_res3.get('R Square'),
                                      }
                                    

pd_reg_search_results = pd_reg_search_results.append(df7,ignore_index = True)
In [106]:
xgb_tuned_res4 = calc_xgb(xgs_train2, xgs_test2, ygs_train2, ygs_test2,learning_rate=0.03, n_estimators=200,silent=False, nthread=0,
             min_samples_leaf =  5, max_features =  7, max_depth =  6, gamma =   2, name = 'XGB Tuned 4')
[11:30:55] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 92.55355463618%
Test Score: 87.87996505154095%
RMSE:352.82683752017704
MAE:74194.82295642493
R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095R Square : 0.8787996505154095
R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095R Square Variance Weighted  :0.8787996505154095
In [107]:
df8 = {'Method':xgb_tuned_res3.get('Method') ,
                                      'Name':'Reduced Columns XG Boost (estimators:200, Learning Rate:0.03, max_features:7,min_samples_leaf:5)',
                                      'Train Score (%)':xgb_tuned_res4.get('Train Square'),
                                      'Test Score (%)':xgb_tuned_res4.get('Test Score'),
                                      'R Square':xgb_tuned_res4.get('R Square')                                      
                                      }
                                    

pd_reg_search_results = pd_reg_search_results.append(df8,ignore_index = True)
In [108]:
xgb_tuned_res5 = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.05, n_estimators=100,silent=False, nthread=0,
             min_samples_leaf =  6, max_features =  8, max_depth =  6, gamma =   0.1             ,name = 'XGB Tuned 1')
[11:31:07] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 93.76295566278385%
Test Score: 89.50532970847715%
RMSE:340.3516369593573
MAE:70626.36108821999
R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714R Square : 0.8950532970847714
R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714R Square Variance Weighted  :0.8950532970847714
In [109]:
xgb_tuned_res5 = calc_xgb(xgs_train1, xgs_test1, ygs_train1, ygs_test1,learning_rate=0.025, n_estimators=100,silent=False, nthread=0,
             min_samples_leaf =  6, max_features =  8, max_depth =  6, gamma =   0.1             ,name = 'XGB Tuned 1')
[11:31:10] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 88.38213557475922%
Test Score: 84.99116451893029%
RMSE:372.1965928957082
MAE:81734.98012809391
R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303R Square : 0.849911645189303
R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303R Square Variance Weighted  :0.849911645189303
In [110]:
pd_reg_search_results.sort_values('Test Score (%)',ascending=False)
Out[110]:
Method Name Train Score (%) Test Score (%) R Square
4 XGB Tuned 1 XG Boost (estimators:200, Learning Rate:0.05, ... 95.25 90.21 0.90
5 XGB Tuned 2 XG Boost (estimators:200, Learning Rate:0.02, ... 93.11 88.73 0.89
6 XGB Tuned 3 Reduced Columns XG Boost (estimators:200, Lear... 92.30 88.20 0.88
7 XGB Tuned 3 Reduced Columns XG Boost (estimators:200, Lear... 92.55 87.88 0.88
0 RF Train Random Forest (estimators:83, max_depth:15, ma... 92.05 87.03 0.87
1 RF Train2 Features Removed Random Forest (estimators:83,... 92.05 86.90 0.87
2 RF Train3 Multi collinear removed frmo Random Forest(est... 91.87 86.74 0.87
3 RF Train3 Multi collinear removed frmo Random Forest(est... 91.87 86.74 0.87

Combining the datasets for visualization with loop as tuples.

In [111]:
datasets_for_visualization = [(rf_result_random  ,ygs_test),
(rf_res_reduced_train, ygs_test1),
(xgb_rand_res, ygs_test),
(xgb_tuned_res1, ygs_test),
(xgb_tuned_res2, ygs_test),
(xgb_tuned_res3 , ygs_test1)           
]
In [112]:
for res_dict, y_actual in datasets_for_visualization:
  
  plt.figure(figsize=(14,8))
  sns.lineplot(x = range(len(y_actual)),y = y_actual,color='yellow',linewidth=1.5)
  sns.lineplot(x = range(len(res_dict.get('Y Pred'))),y = res_dict.get('Y Pred'),color='black',linewidth=.5)
  plt.title('Actual and Predicted for {0}'.format(res_dict.get('Method')), fontsize=20)     
  plt.xlabel('Index', fontsize=10)                           # X-label
  plt.ylabel('Values', fontsize=10)
In [113]:
GBR_test=GradientBoostingRegressor(random_state=22)
In [114]:
param_grid1 = {'n_estimators': [50,400,500, 900]}
grid_search1 = GridSearchCV(estimator = GBR_test, param_grid = param_grid1, cv = 3, n_jobs = 2, verbose = 1)
grid_search1.fit(xgs_train,ygs_train)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Out[114]:
GridSearchCV(cv=3, estimator=GradientBoostingRegressor(random_state=22),
             n_jobs=2, param_grid={'n_estimators': [50, 400, 500, 900]},
             verbose=1)
In [115]:
print (grid_search1.best_params_)
print (grid_search1.best_score_)
{'n_estimators': 900}
0.8812135443274706
In [116]:
param_grid2 = {'n_estimators': [ 900,  1200, 1300, 1400]}
grid_search2 = GridSearchCV(estimator = GBR_test, param_grid = param_grid2, cv = 3, n_jobs = 2, verbose = 1)
grid_search2.fit(xgs_train,ygs_train)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Out[116]:
GridSearchCV(cv=3, estimator=GradientBoostingRegressor(random_state=22),
             n_jobs=2, param_grid={'n_estimators': [900, 1200, 1300, 1400]},
             verbose=1)
In [117]:
print (grid_search2.best_params_)
print (grid_search2.best_score_)
{'n_estimators': 1200}
0.8813882342305893
In [ ]:
param_grid3 = {
    'learning_rate': [0.1,0.25],
    'max_depth': [5,5],
    'min_samples_leaf': [5,8],
    'min_samples_split': [40,50],
    'n_estimators': [200, 500, 800, 1200],
}
GBR_test=GradientBoostingRegressor(random_state=22)

grid_search3 = GridSearchCV(estimator = GBR_test, param_grid = param_grid3, 
                          cv = 5, n_jobs = 3, verbose = 1)
grid_search3.fit(xgs_train,ygs_train)
In [ ]:
print (grid_search3.best_params_)
print (grid_search3.best_score_)
In [120]:
def calc_gb(x_train, x_test, y_train, y_test,learning_rate=0, n_estimators=0,silent=False, nthread=0,
             min_samples_leaf =  0,  max_depth =  0, name = ''):
    xgb = XGBRegressor(learning_rate=learning_rate, n_estimators=n_estimators,silent=silent, nthread=nthread,min_samples_leaf = min_samples_leaf, 
                        max_depth =  max_depth)
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    xgb.fit(x_train, y_train)      
    y_pred_xgb =  xgb.predict(x_test)
    train_score =  xgb.score(x_train, y_train) * 100
    test_score = xgb.score(x_test, y_test) * 100
    rmse = mean_squared_error(y_test, y_pred_xgb, squared=False)
    rmse_val = np.sqrt(rmse)
    mae =mean_absolute_error(y_test, y_pred_xgb)
    r2_score_val = r2_score(y_test, y_pred_xgb)
    r2_score_variance_weighted = r2_score(y_test, y_pred_xgb, multioutput='variance_weighted')
    print('Train Score: {0}%'.format(train_score))
    print('Test Score: {0}%'.format( test_score))
    print('RMSE:{0}'.format(rmse_val))
    print('MAE:{0}'.format(mae))
    print('R Square : {0}'.format(r2_score_val)* 100)
    print('R Square Variance Weighted  :{0}'.format(r2_score_variance_weighted)*100)
    return {'Method':name,'Train Square':train_score,'Test Score':test_score,'RMSE':rmse_val,
            'MAE':mae,'R Square':r2_score_val,'R Square Variance Weighted':r2_score_variance_weighted,
            'Y Pred':y_pred_xgb}
In [121]:
gb_tuned_res1 = calc_gb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.001, n_estimators=1200,silent=False, nthread=0,
             min_samples_leaf =  5,  max_depth =  7,  name = 'GB Tuned 1')
(17290, 14)
(4323, 14)
(17290,)
(4323,)
[12:09:05] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 59.80680456170169%
Test Score: 55.82653849033782%
RMSE:487.50155143880494
MAE:166338.63475884803
R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782R Square : 0.5582653849033782
R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782R Square Variance Weighted  :0.5582653849033782
In [122]:
gb_tuned_res1 = calc_gb(xgs_train1, xgs_test1, ygs_train1, ygs_test1,learning_rate=0.1, n_estimators=1200,silent=False, nthread=0,
             min_samples_leaf =  5,  max_depth =  5,  name = 'GB Tuned 2')
(17290, 12)
(4323, 12)
(17290,)
(4323,)
[12:09:42] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 98.35029086246713%
Test Score: 89.73816981329239%
RMSE:338.4479203873019
MAE:69072.22261558814
R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239R Square : 0.8973816981329239
R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239R Square Variance Weighted  :0.8973816981329239
In [123]:
gb_tuned_res2 = calc_gb(xgs_train2, xgs_test2, ygs_train2, ygs_test2,learning_rate=0.1, n_estimators=1200,silent=False, nthread=0,
             min_samples_leaf =  5,  max_depth =  5,  name = 'GB Tuned 3')
(17290, 10)
(4323, 10)
(17290,)
(4323,)
[12:09:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Train Score: 97.91503918582785%
Test Score: 88.15088095711371%
RMSE:350.83843062348916
MAE:70515.29252327666
R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372R Square : 0.8815088095711372
R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372R Square Variance Weighted  :0.8815088095711372
In [124]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

num_folds = 40
seed = 1001

xgb = XGBRegressor(learning_rate=0.02, n_estimators=200,silent=False, nthread=0,
             min_samples_leaf =  5, max_features =  11, max_depth =  6, gamma =   2)

kfold = KFold(n_splits=num_folds)
results = cross_val_score(xgb, xgs, ygs, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[12:10:15] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:20] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:24] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:29] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:33] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:37] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:42] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:46] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:51] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:55] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:10:59] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:04] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:09] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:13] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:17] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:22] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:26] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:30] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:35] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:39] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:44] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:48] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:52] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:11:57] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:01] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:05] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:10] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:14] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:18] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:23] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:27] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:31] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:36] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:40] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:45] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:49] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:53] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:12:58] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:13:03] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[12:13:07] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[0.90379677 0.90391921 0.79454755 0.89698838 0.88983258 0.89747905
 0.90492602 0.87562231 0.91281305 0.8850368  0.84160655 0.88499345
 0.883583   0.87885018 0.88522659 0.89028308 0.87652394 0.90871983
 0.87001349 0.83504145 0.87936096 0.87557745 0.89371786 0.88775706
 0.87731032 0.88505692 0.86722249 0.8417517  0.85967331 0.85685236
 0.89059988 0.83089964 0.9036672  0.85829657 0.86664766 0.8712791
 0.85977767 0.89014825 0.84425577 0.86120628]
Accuracy: 87.552% (2.424%)
In [125]:
from matplotlib import pyplot
# plot scores
pyplot.hist(results)
pyplot.show()
# confidence intervals
alpha = 0.95                             # for 95% confidence 
p = ((1.0-alpha)/2.0) * 100              # tail regions on right and left .25 on each side indicated by P value (border)
lower = max(0.0, np.percentile(results, p))  
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(results, p))
print('confidence interval of {0}% occurs between {1} and {2}' .format (alpha*100, lower*100, upper*100))
confidence interval of 95.0% occurs between 82.99908339166733 and 90.88221649599521
  • The ensemble models have performed well compared to that of
  1. List item
  2. List item

linear,KNN models

  • The best performance is given by XG Boost model with training (Score – 95.07%), Testing (score-89.85%, RMSE-337.54)

Pickle save and load model for deployment

XGB Regressor:

In [126]:
xgb11 = XGBRegressor(learning_rate=0.05, n_estimators=200,silent=False, nthread=0,min_samples_leaf = 3, 
                       max_features = 7, max_depth = 6, gamma = 1)
xgb11.fit(xgs_train, ygs_train)
[12:13:50] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[126]:
XGBRegressor(gamma=1, learning_rate=0.05, max_depth=6, max_features=7,
             min_samples_leaf=3, n_estimators=200, nthread=0, silent=False)
In [127]:
#New version compatibility of model with the older version
xgb11.save_model('xgb_model22.bin')
In [128]:
xgs_train.shape, xgs_train.columns
Out[128]:
((17290, 14), Index(['room_bath', 'furnished', 'quality', 'yr_built', 'sight',
        'yr_renovated', 'lat', 'coast', 'ceil_measure', 'lot_measure15', 'long',
        'living_measure15', 'living_measure', 'zipcode'],
       dtype='object'))
In [129]:
xgb11.predict(xgs_test)
Out[129]:
array([ 448603.03,  470206.53,  464114.78, ...,  363742.72, 1266080.1 ,
        312500.03], dtype=float32)

RF Regressor:

In [130]:
rfg22 = RandomForestRegressor(n_estimators=83, max_depth=15, max_features=8,min_samples_leaf=4)

#(estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)

rfg22.fit(xgs_train2, ygs_train2)
Out[130]:
RandomForestRegressor(max_depth=15, max_features=8, min_samples_leaf=4,
                      n_estimators=83)
In [131]:
import pickle
with open('rfg_model22.pkl','wb') as k:
  pickle.dump(rfg22,k)
In [132]:
xgs_train2.shape, xgs_train2.columns
Out[132]:
((17290, 10),
 Index(['room_bath', 'furnished', 'quality', 'yr_built', 'yr_renovated', 'lat',
        'coast', 'long', 'living_measure', 'zipcode'],
       dtype='object'))
In [ ]: